EDE2 - Semester Project¶

Time Series Analysis - Solar Power Generation in Germany¶

Student name:

  • Fjolla Fazliu,
  • Akin Tarakci,
  • Saeed Sohbatloo

Solar Power Plant Distribution

Data Preparation¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

import warnings
warnings.filterwarnings('ignore')
In [2]:
input_paths = [
    "energy-charts_Total_net_electricity_generation_in_Germany_in_2021.csv",
    "energy-charts_Total_net_electricity_generation_in_Germany_in_2022.csv",
    "energy-charts_Total_net_electricity_generation_in_Germany_in_2023.csv",
    "energy-charts_Total_net_electricity_generation_in_Germany_in_2024.csv",
    "energy-charts_Total_net_electricity_generation_in_Germany_in_2025.csv",
]
In [3]:
# Load all files into separate DataFrames with low_memory=False 
dfs = [pd.read_csv(path, low_memory=False) for path in input_paths]

# Concatenate all DataFrames into one
df_all = pd.concat(dfs, ignore_index=True)
df_all = df_all.sort_index()
In [4]:
# Select only the columns "Date (GMT+1)" and "Solar"
df_solar = df_all[["Date (GMT+1)", "Solar"]]

# Optional: Clean column names (strip any leading/trailing spaces)
df_solar.columns = df_solar.columns.str.strip()

# Optional: Check the result
print(df_solar.head())
             Date (GMT+1)       Solar
0                     NaN  Power (MW)
1  2021-01-01T00:00+01:00           0
2  2021-01-01T00:15+01:00           0
3  2021-01-01T00:30+01:00           0
4  2021-01-01T00:45+01:00           0
In [5]:
# Load your data (if not already loaded)
# df_solar = pd.read_csv('your_file.csv')

# Remove any accidental spaces from column names
df_solar.columns = df_solar.columns.str.strip()

# Rename columns
df_solar = df_solar.rename(columns={"Date (GMT+1)": "Date-Time", "Solar": "Solar-Power-MW"})

# Convert 'Date-Time' to datetime
df_solar["Date-Time"] = pd.to_datetime(df_solar["Date-Time"], utc=True)

# Remove timezone
df_solar["Date-Time"] = df_solar["Date-Time"].dt.tz_localize(None)

# Make 'Solar-Power-MW' numeric
df_solar["Solar-Power-MW"] = pd.to_numeric(df_solar["Solar-Power-MW"], errors="coerce")

# Set Date-Time as index
df_solar = df_solar.set_index('Date-Time')

# Check
print(df_solar.head())
print(df_solar.index)
                     Solar-Power-MW
Date-Time                          
NaT                             NaN
2020-12-31 23:00:00             0.0
2020-12-31 23:15:00             0.0
2020-12-31 23:30:00             0.0
2020-12-31 23:45:00             0.0
DatetimeIndex([                'NaT', '2020-12-31 23:00:00',
               '2020-12-31 23:15:00', '2020-12-31 23:30:00',
               '2020-12-31 23:45:00', '2021-01-01 00:00:00',
               '2021-01-01 00:15:00', '2021-01-01 00:30:00',
               '2021-01-01 00:45:00', '2021-01-01 01:00:00',
               ...
               '2025-04-18 13:00:00', '2025-04-18 13:15:00',
               '2025-04-18 13:30:00', '2025-04-18 13:45:00',
               '2025-04-18 14:00:00', '2025-04-18 14:15:00',
               '2025-04-18 14:30:00', '2025-04-18 14:45:00',
               '2025-04-18 15:00:00', '2025-04-18 15:15:00'],
              dtype='datetime64[ns]', name='Date-Time', length=150599, freq=None)
In [6]:
df_solar.describe()
Out[6]:
Solar-Power-MW
count 150594.000000
mean 6995.386827
std 10812.629715
min 0.000000
25% 0.000000
50% 95.000000
75% 11357.900000
max 53288.400000
In [7]:
# Check for missing values (nulls) in each column
missing_values = df_solar.isnull().sum()

# Print missing values
print(missing_values)
Solar-Power-MW    5
dtype: int64
In [8]:
# Find rows where 'Solar-Power-MW' is null
null_solar = df_solar[df_solar['Solar-Power-MW'].isnull()]

# Combine both filters to find rows where either column is null
null_rows = df_solar[ df_solar['Solar-Power-MW'].isnull()]

# Print the rows with missing values in either column
print(null_rows)
           Solar-Power-MW
Date-Time                
NaT                   NaN
NaT                   NaN
NaT                   NaN
NaT                   NaN
NaT                   NaN
In [9]:
# Drop rows where either 'Date-Time' or 'Solar-Power-MW' is null
df_solar = df_solar.dropna(subset=['Solar-Power-MW'])
df_modeling=df_solar

# Check the result
print(df_solar.head())
                     Solar-Power-MW
Date-Time                          
2020-12-31 23:00:00             0.0
2020-12-31 23:15:00             0.0
2020-12-31 23:30:00             0.0
2020-12-31 23:45:00             0.0
2021-01-01 00:00:00             0.0
In [12]:
# Create train (before 2025) and test (only 2025) sets
df_solar_2025 = df_solar[df_solar.index.year == 2025]
df_solar = df_solar[df_solar.index.year < 2025]

# Check the results
print("df_solar (train):", df_solar.shape)
print("df_solar_2025 (test):", df_solar_2025.shape)
df_solar (train): (140260, 1)
df_solar_2025 (test): (10334, 1)
In [13]:
plt.figure(figsize=(14, 6))

# Plot training data
plt.plot(df_solar.index, df_solar['Solar-Power-MW'], label='Train Data (Before 2025)', color='blue', alpha=0.7)

# Plot test data
#plt.plot(df_solar_2025.index, df_solar_2025['Solar-Power-MW'], label='Test Data (2025)', color='red', alpha=0.7)

plt.xlabel('Date-Time')
plt.ylabel('Solar Power (MW)')
plt.title('Solar Power Time Series: Train vs Test (2025)')
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image

Exploratory Data analysis - original time series¶

In [14]:
# Ensure index is datetime
df_solar.index = pd.to_datetime(df_solar.index)

# Define the years
years = [2021, 2022, 2023, 2024]

# Create 2x2 subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True)

# Flatten axes array
axes = axes.flatten()

# Loop through each year and plot
for i, year in enumerate(years):
    df_year = df_solar[df_solar.index.year == year]
    daily_avg = df_year.groupby(df_year.index.hour)['Solar-Power-MW'].mean()
    
    axes[i].plot(daily_avg.index, daily_avg.values, marker='o')
    axes[i].set_title(f"Avg Daily Solar Power Profile in {year}")
    axes[i].set_xlabel("Hour of Day")
    axes[i].set_ylabel("Avg Solar Power (MW)")
    axes[i].grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
# Assuming df_solar already has the Date-Time index and Solar-Power-MW column

# Create "time of day" feature from the index
df_solar['time_of_day'] = df_solar.index.time

# Convert time_of_day to minutes since midnight (or hours as a float)
df_solar['time_of_day_in_minutes'] = df_solar['time_of_day'].apply(lambda x: x.hour * 60 + x.minute)

# Group by time of day and calculate the average
avg_profile = df_solar.groupby('time_of_day_in_minutes')['Solar-Power-MW'].mean()

# Plot
plt.figure(figsize=(10,5))
plt.plot(avg_profile.index, avg_profile.values, label='Average Solar Power by Time of Day')

# Convert x-ticks to time format (HH:MM) for better readability
plt.xticks(
    ticks=range(0, 24*60, 60),  # Tick every hour
    labels=[f'{i//60:02d}:{i%60:02d}' for i in range(0, 24*60, 60)]  # Format as HH:MM
)

# Title and labels
plt.title('Typical Daily Solar Power Profile')
plt.xlabel('Time of Day')
plt.ylabel('Power (MW)')
plt.grid(True)
plt.xticks(rotation=45)
plt.legend()

# Tight layout for better fitting
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [16]:
# Ensure index is datetime
df_solar.index = pd.to_datetime(df_solar.index)

# Define the years you want to plot
years = [2021, 2022, 2023, 2024]

# Create subplots: 2 rows, 2 columns
fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True)

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Loop through years and axes
for i, year in enumerate(years):
    df_year = df_solar[df_solar.index.year == year]
    sns.boxplot(x=df_year.index.month, y=df_year['Solar-Power-MW'], ax=axes[i])
    axes[i].set_title(f"Monthly Solar Power in {year}")
    axes[i].set_xlabel("Month")
    axes[i].set_ylabel("Solar Power (MW)")
    axes[i].grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
# Plot Monthly Distribution of Solar Power using the index's month
plt.figure(figsize=(12, 6))
sns.boxplot(x=df_solar.index.month, y=df_solar['Solar-Power-MW'])
plt.title("Monthly Distribution of Solar Power")
plt.xlabel("Month")
plt.ylabel("Solar Power (MW)")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [18]:
# Ensure datetime index
df_solar.index = pd.to_datetime(df_solar.index)

# Map months to seasons
def get_season(month):
    return {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Fall",   10: "Fall",  11: "Fall"
    }[month]

# Add season and year columns
df_solar["season"] = df_solar.index.month.map(get_season)
df_solar["year"] = df_solar.index.year

# Group and sum solar power by year and season
seasonal_totals = df_solar.groupby(["year", "season"])["Solar-Power-MW"].sum().unstack(fill_value=0)

# Keep only 2021–2024
seasonal_totals = seasonal_totals.loc[[2021, 2022, 2023, 2024]]

# Sort columns to have consistent season order
season_order = ["Winter", "Spring", "Summer", "Fall"]
seasonal_totals = seasonal_totals[season_order]

# Plot stacked bar chart
seasonal_totals.plot(kind="bar", figsize=(12, 6), stacked=True, colormap="tab20c")
plt.title("Total Solar Power Generation by Season (2021-2024)")
plt.ylabel("Total Power (MW)")
plt.xlabel("Year")
plt.legend(title="Season")
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [21]:
# Ensure datetime index
df_solar.index = pd.to_datetime(df_solar.index)

# Add month and year columns
df_solar["month"] = df_solar.index.month
df_solar["year"] = df_solar.index.year

# Group and sum solar power by year and month
monthly_totals = df_solar.groupby(["year", "month"])["Solar-Power-MW"].sum().unstack(fill_value=0)

# Keep only 2021–2024
monthly_totals = monthly_totals.loc[[2021, 2022, 2023, 2024]]

# Plot stacked bar chart
monthly_totals.plot(kind="bar", figsize=(12, 6), stacked=True, colormap="tab20c")
plt.title("Total Solar Power Generation by Month (2021-2024)")
plt.ylabel("Total Power (MW)")
plt.xlabel("Year")
plt.legend(title="Month", labels=[
    "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"])
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [22]:
# Ensure datetime index
df_solar.index = pd.to_datetime(df_solar.index)

# Map months to seasons
def get_season(month):
    return {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Fall",   10: "Fall",  11: "Fall"
    }[month]

# Add season and year columns
df_solar["season"] = df_solar.index.month.map(get_season)
df_solar["year"] = df_solar.index.year

# Group and sum by year and season
seasonal_totals = df_solar.groupby(["year", "season"])["Solar-Power-MW"].sum().unstack(fill_value=0)

# Keep only 2021–2024
seasonal_totals = seasonal_totals.loc[[2021, 2022, 2023, 2024]]

# Sort columns by season
season_order = ["Winter", "Spring", "Summer", "Fall"]
seasonal_totals = seasonal_totals[season_order]

# Plot donut charts in 2x2 grid
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3']
axs = axs.flatten()  # Flatten for easy indexing

for i, year in enumerate(seasonal_totals.index):
    values = seasonal_totals.loc[year]
    total = values.sum()  # Total generation for the year

    # Create labels with both GWh and percentage
    labels = [
        f"{season}\n{value/1e3:.1f} GWh\n({value/total*100:.1f}%)"
        for season, value in values.items()
    ]

    axs[i].pie(
        values,
        labels=labels,
        colors=colors,
        startangle=90,
        wedgeprops={'width': 0.4}
    )
    axs[i].set_title(f"{year}")

plt.suptitle("Donut Chart of Seasonal Solar Generation (GWh) with Percentages", fontsize=16)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [23]:
# Sum all years (2021–2024) by season
total_by_season = seasonal_totals.sum(axis=0)

# Compute total sum to calculate percentages
total_sum = total_by_season.sum()

# Format labels with GWh and percentage
labels = [
    f"{season}\n{value/1e3:.1f} GWh\n{value/total_sum*100:.1f}%"
    for season, value in total_by_season.items()
]

# Plot the donut chart
fig, ax = plt.subplots(figsize=(6, 6))

ax.pie(
    total_by_season,
    labels=labels,
    colors=colors,
    startangle=90,
    wedgeprops={'width': 0.4}
)

ax.set_title("Total Seasonal Solar Generation (2021–2024)", fontsize=14)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [24]:
plt.figure(figsize=(10, 5))
sns.histplot(df_solar['Solar-Power-MW'], bins=50, kde=True, color='orange')
plt.title("Distribution of Solar Power Values")
plt.xlabel("Solar Power (MW)")
plt.ylabel("Frequency")
plt.axvline(df_solar['Solar-Power-MW'].mean(), color='blue', linestyle='--', label=f"Mean: {df_solar['Solar-Power-MW'].mean():.1f}")
plt.axvline(df_solar['Solar-Power-MW'].median(), color='green', linestyle='--', label=f"Median: {df_solar['Solar-Power-MW'].median():.1f}")
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [25]:
# Ensure index is datetime
df_solar.index = pd.to_datetime(df_solar.index)

# Define the years
years = [2021, 2022, 2023, 2024]

# Dictionary to hold stats
yearly_stats = {}

# Loop through each year and compute describe
for year in years:
    df_year = df_solar[df_solar.index.year == year]
    stats = df_year['Solar-Power-MW'].describe()
    yearly_stats[year] = stats

# Display results
for year, stats in yearly_stats.items():
    print(f"\nStatistical Summary for {year}:")
    print(stats)
Statistical Summary for 2021:
count    35040.000000
mean      5648.630100
std       8741.721706
min          0.000000
25%          0.000000
50%        112.200000
75%       9232.475000
max      38770.900000
Name: Solar-Power-MW, dtype: float64

Statistical Summary for 2022:
count    35040.000000
mean      6830.657209
std      10308.536619
min          0.000000
25%          0.000000
50%        110.050000
75%      11337.125000
max      40368.600000
Name: Solar-Power-MW, dtype: float64

Statistical Summary for 2023:
count    35040.000000
mean      7089.041047
std      10857.375718
min          0.000000
25%          0.000000
50%        115.150000
75%      11329.800000
max      45036.600000
Name: Solar-Power-MW, dtype: float64

Statistical Summary for 2024:
count    35136.000000
mean      8267.076531
std      12334.219564
min          0.000000
25%          0.000000
50%        153.700000
75%      13960.775000
max      51607.900000
Name: Solar-Power-MW, dtype: float64
In [26]:
# Basic statistical summary
stats = df_solar['Solar-Power-MW'].describe()
print(stats)
count    140260.000000
mean       6959.548172
std       10680.034551
min           0.000000
25%           0.000000
50%         121.000000
75%       11364.875000
max       51607.900000
Name: Solar-Power-MW, dtype: float64
In [27]:
# Ensure index is datetime
df_solar.index = pd.to_datetime(df_solar.index)

# Define years
years = [2021, 2022, 2023, 2024]

# Compute per-year statistics
summary_df = pd.DataFrame()
for year in years:
    df_year = df_solar[df_solar.index.year == year]
    stats = df_year['Solar-Power-MW'].describe()
    summary_df[year] = stats

summary_df = summary_df.T

# Select stats to compare
columns_to_plot = ['mean', 'max', 'std', 'min', '75%']

# Radar chart setup
labels = columns_to_plot
num_vars = len(labels)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]  # close the loop

# Initialize plot
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# Plot each year's stats
for year in years:
    values = summary_df.loc[year, labels].tolist()
    values += values[:1]
    ax.plot(angles, values, label=str(year))
    ax.fill(angles, values, alpha=0.1)

# Add overall stats
overall_stats = df_solar['Solar-Power-MW'].describe()
overall_values = overall_stats[labels].tolist()
overall_values += overall_values[:1]
ax.plot(angles, overall_values, label='Overall', linestyle='--', linewidth=1, color='black')
ax.fill(angles, overall_values, alpha=0.1, color='gray')

# Final plot settings
ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)
ax.set_title("Radar Chart: Solar Power Statistics by Year + Overall", size=14)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.show()
No description has been provided for this image
In [26]:
#first idea remove daily trend from ts then run decomposition
In [29]:
# Step 1: Calculate the daily profile by averaging over each 15-minute interval (same time every day)
df_solar['time'] = df_solar.index.time  # Extract time from the DateTimeIndex
daily_profile = df_solar.groupby('time')['Solar-Power-MW'].mean()  # Average power at each 15-minute interval

# Step 2: Remove the daily profile from the original time series
df_solar['daily_profile'] = df_solar['time'].map(daily_profile)
df_solar['removed_dialy_profile'] = df_solar['Solar-Power-MW'] - df_solar['daily_profile']
In [30]:
def plot_solar_components(df_solar, date_str):

    # Filter for the selected date
    daily_data = df_solar.loc[date_str]

    # Plot
    plt.figure(figsize=(15, 5))
    plt.plot(daily_data.index, daily_data['Solar-Power-MW'], label='Actual Solar Power (MW)', color='blue')
    plt.plot(daily_data.index, daily_data['daily_profile'], label='Daily Profile', color='green')
    plt.plot(daily_data.index, daily_data['removed_dialy_profile'], label='Removed Daily Profile', color='orange', linestyle='--')

    plt.title(f'Solar Generation Components for {date_str}')
    plt.xlabel('Time')
    plt.ylabel('Power (MW)')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
In [31]:
plot_solar_components(df_solar, '2024-05-15')
No description has been provided for this image

Phase 1: Seasonal decomposition¶

Seasonal decomposition applied on the residual series.

In [32]:
# Step 3: Applying seasonal decomposition on the residual series
# As we're using 15-minute intervals, we set the period to 96 (15-minute data points per hour * 24 hours)
residual_series = df_solar['removed_dialy_profile']
result = sm.tsa.seasonal_decompose(residual_series, period=96*365, model='additive')

# Step 4: Extracting the components (trend, seasonal, residuals) from the decomposition
trend = result.trend
seasonal = result.seasonal
residual = result.resid

# Step 5: Ploting the components
fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)

# Ploting the original series
axs[0].plot(df_solar.index, df_solar['Solar-Power-MW'], label='Original', color='blue')
axs[0].set_title('Original Solar Power Generation')

# Ploting the daily profile
axs[1].plot(df_solar.index, df_solar['daily_profile'], label='Daily Profile', color='orange')
axs[1].set_title('Daily Profile')

# Ploting the trend component from seasonal_decompose
axs[2].plot(df_solar.index, trend, label='Trend', color='green')
axs[2].set_title('Trend')

# Ploting the seasonal component from seasonal_decompose
axs[3].plot(df_solar.index, seasonal, label='Seasonal', color='red')
axs[3].set_title('Seasonal Component')

# Ploting the residual component from seasonal_decompose
axs[4].plot(df_solar.index, residual, label='Residual', color='purple')
axs[4].set_title('Residuals')

# Add labels and grid to all plots
for ax in axs:
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [33]:
# Step 5: Plot the components
fig, axs = plt.subplots(6, 1, figsize=(15, 18), sharex=True)  # Note: changed to 6 plots

# Plot the original series
axs[0].plot(df_solar.index, df_solar['Solar-Power-MW'], label='Original', color='blue')
axs[0].set_title('Original Solar Power Generation')

# Plot the daily profile
axs[1].plot(df_solar.index, df_solar['daily_profile'], label='Daily Profile', color='orange')
axs[1].set_title('Daily Profile')

# Plot the removed daily profile
axs[2].plot(df_solar.index, df_solar['removed_dialy_profile'], label='Removed Daily Profile', color='darkcyan')
axs[2].set_title('Removed Daily Profile (De-trended)')

# Plot the trend component from seasonal_decompose
axs[3].plot(df_solar.index, trend, label='Trend', color='green')
axs[3].set_title('Trend')

# Plot the seasonal component from seasonal_decompose
axs[4].plot(df_solar.index, seasonal, label='Seasonal', color='red')
axs[4].set_title('Seasonal Component')

# Plot the residual component from seasonal_decompose
axs[5].plot(df_solar.index, residual, label='Residual', color='purple')
axs[5].set_title('Residuals')

# Add labels and grid to all plots
for ax in axs:
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [34]:
# Define the time range you want to zoom into (for example, from June 1, 2022 to June 2, 2022)
start_time = pd.to_datetime('2024-04-01 00:00:00')
end_time = pd.to_datetime('2024-05-05 00:00:00')

# Filter the data to show only the desired time range
df_zoomed = df_solar[(df_solar.index >= start_time) & (df_solar.index <= end_time)]
trend_zoomed = trend[(df_solar.index >= start_time) & (df_solar.index <= end_time)]
seasonal_zoomed = seasonal[(df_solar.index >= start_time) & (df_solar.index <= end_time)]
residual_zoomed = residual[(df_solar.index >= start_time) & (df_solar.index <= end_time)]

# Step 5: Plot the components for the zoomed time range
fig, axs = plt.subplots(6, 1, figsize=(15, 15), sharex=True)

# Plot the original series
axs[0].plot(df_zoomed.index, df_zoomed['Solar-Power-MW'], label='Original', color='blue')
axs[0].set_title('Original Solar Power Generation')

# Plot the daily profile
axs[1].plot(df_zoomed.index, df_zoomed['daily_profile'], label='Daily Profile', color='orange')
axs[1].set_title('Daily Profile')

# Plot the removed daily profile
axs[2].plot(df_zoomed.index, df_zoomed['removed_dialy_profile'], label='Removed Daily Profile', color='darkcyan')
axs[2].set_title('Removed Daily Profile')

# Plot the trend component from seasonal_decompose
axs[3].plot(df_zoomed.index, trend_zoomed, label='Trend', color='green')
axs[3].set_title('Trend')

# Plot the seasonal component from seasonal_decompose
axs[4].plot(df_zoomed.index, seasonal_zoomed, label='Seasonal', color='red')
axs[4].set_title('Seasonal Component')

# Plot the residual component from seasonal_decompose
axs[5].plot(df_zoomed.index, residual_zoomed, label='Residual', color='purple')
axs[5].set_title('Residuals')

# Add labels and grid to all plots
for ax in axs:
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [35]:
components = ['Trend', 'Seasonal', 'Residual']
stds = [trend.std(), seasonal.std(), residual.std()]

plt.figure(figsize=(8, 5))
plt.bar(components, stds, color=['steelblue', 'orange', 'gray'])
plt.title('Standard Deviation of Decomposed Components')
plt.ylabel('Standard Deviation (MW)')
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [36]:
plt.figure(figsize=(8, 4))
sns.histplot(residual, kde=True, bins=30, color='gray')
plt.title("Distribution of Residuals")
plt.xlabel("Residual Value (MW)")
plt.grid(True)
plt.tight_layout()
plt.show()

print("Residual Mean:", residual.mean())
print("Residual std:", residual.std())
print("Residual Skewness:", residual.skew())
print("Residual Kurtosis:", residual.kurtosis())
No description has been provided for this image
Residual Mean: -42.04304960633642
Residual std: 2919.0132377156315
Residual Skewness: 0.2683833666850075
Residual Kurtosis: 6.3338435086884095
In [37]:
from statsmodels.graphics.tsaplots import plot_acf
# Drop NaNs from residuals
residual_clean = residual.dropna()

# Plot ACF
plt.figure(figsize=(8, 4))
plot_acf(residual_clean, lags=1000)
plt.title("Autocorrelation of Residuals")
plt.tight_layout()
plt.show()
<Figure size 800x400 with 0 Axes>
No description has been provided for this image

✅ Steps to Recompose the Series

Recompose by:
Synthetic Series=Trend+Seasonal+Daily Profile+Synthetic Residuals

Recompose the time series¶

Recomposed by: Synthetic Series = Trend + Seasonal + Daily Profile + Synthetic Residuals

In [45]:
from sklearn.linear_model import LinearRegression


# Step 1: Prepare the data (drop NaN values from the trend component)
df_solar['trend_decomp'] = result.trend  # Assuming 'result' is your seasonal decomposition result

# Drop NaN values from the trend component for linear regression
df_trend = df_solar[['trend_decomp']].dropna().copy()

# Calculate time in minutes from the first data point (for the independent variable)
df_trend['time_num'] = (df_trend.index - df_trend.index[0]).total_seconds() / 60  # in minutes

# Step 2: Fit linear regression
X = df_trend[['time_num']]  # Independent variable: time in minutes
y = df_trend['trend_decomp']  # Dependent variable: trend values from decomposition
model = LinearRegression().fit(X, y)  # Fit the model

# Step 3: Predict the linear trend over the entire time range
df_solar['time_num'] = (df_solar.index - df_trend.index[0]).total_seconds() / 60  # Recalculate time for the full dataset
df_solar['linear_trend'] = model.predict(df_solar[['time_num']])  # Predict trend using linear regression

# Step 4: Plot the results
plt.figure(figsize=(15, 5))

# Plot the original trend component and the smoothed linear trend
plt.plot(df_solar.index, df_solar['trend_decomp'], label='Original Trend', alpha=0.5)
plt.plot(df_solar.index, df_solar['linear_trend_decomp'], label='Smoothed Linear Trend', color='red')

# Add labels, grid, and title
plt.legend()
plt.grid(True)
plt.title('Original Trend vs Linear Smoothed Trend')
plt.xlabel('Date')
plt.ylabel('Trend Power (MW)')

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image
In [46]:
# Ensure everything is aligned
df_solar['trend'] = result.trend
df_solar['seasonal'] = result.seasonal
df_solar['residual'] = result.resid

# Estimate daily profile again (as before)
df_solar['time'] = df_solar.index.time
daily_profile = df_solar.groupby('time')['Solar-Power-MW'].median()
df_solar['daily_profile'] = df_solar['time'].map(daily_profile)
In [47]:
# Estimate residual stats
resid_mean = df_solar['residual'].mean()
resid_std = df_solar['residual'].std()

# Simulate new noise
np.random.seed(42)  # for reproducibility
synthetic_residual = np.random.normal(loc=resid_mean, scale=resid_std, size=len(df_solar))
In [48]:
# Plot actual vs synthetic residuals
plt.figure(figsize=(15, 5))
plt.plot(df_solar.index, df_solar['residual'], label='Actual Residual', color='purple', alpha=0.7)
plt.plot(df_solar.index, synthetic_residual, label='Synthetic Residual', color='grey', linestyle='--', alpha=0.7)

plt.title("Actual vs Synthetic Residuals")
plt.xlabel("Date-Time")
plt.ylabel("Residual")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [49]:
from scipy.stats import norm

# Fit a normal distribution to the residuals
mu, std = norm.fit(df_solar['residual'].dropna())

# Plot the histogram and the fitted normal distribution
plt.figure(figsize=(12, 6))
sns.histplot(df_solar['residual'], kde=True, color='purple', bins=50, stat='density')
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
title = "Fit results: mean = %.2f,  std = %.2f" % (mu, std)
plt.title(title)
plt.xlabel('Residual Value')
plt.ylabel('Density')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [50]:
# Build the synthetic time series
df_solar['synthetic'] = (
    df_solar['linear_trend'].fillna(0) +
    #df_solar['polynomial_trend'].fillna(0) + 
    df_solar['seasonal'].fillna(0) +
    df_solar['daily_profile'].fillna(0) +
    synthetic_residual
)

#clip the recomposed series to avoid negatives:
df_solar['synthetic'] = df_solar['synthetic'].clip(lower=0)

# -------------------------------
# Plot the synthetic vs. original
# -------------------------------
fig, axs = plt.subplots(2, 1, figsize=(15, 8), sharex=True)

axs[0].plot(df_solar.index, df_solar['Solar-Power-MW'], label='Original', alpha=0.7)
axs[0].set_title('Original Solar Power Generation')
axs[0].legend()
axs[0].grid(True)

axs[1].plot(df_solar.index, df_solar['synthetic'], label='Synthetic', color='orange', alpha=0.7)
axs[1].set_title('Synthetic Recomposition')
axs[1].legend()
axs[1].grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [51]:
plt.figure(figsize=(15, 6))
plt.plot(df_solar.index, df_solar['Solar-Power-MW'], label='Original', alpha=0.7)
plt.plot(df_solar.index, df_solar['synthetic'], label='Synthetic', color='orange', alpha=0.6)
plt.title('Original vs Synthetic Solar Power Generation')
plt.xlabel('Time')
plt.ylabel('Power (MW)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [54]:
def plot_comparison_range(df_combined, start, end):
    # Ensure the DataFrame is sorted by the DatetimeIndex
    df_combined = df_combined.sort_index()

    # Slice the data within the provided range
    df_window = df_combined.loc[start:end]

    # Plot the real and synthetic power
    df_window[["Solar-Power-MW", "synthetic"]].plot(figsize=(14, 4))
    
    plt.title(f"Real vs Synthetic Power ({start} to {end})")
    plt.xlabel("Date")
    plt.ylabel("Power (MW)")
    plt.legend(["Real Power", "Synthetic Power"])
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
plot_comparison_range(df_solar, "2024-05-13", "2024-05-16")
No description has been provided for this image
In [55]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def plot_kde_and_metrics(df_combined):
    # Ensure no negative values in synthetic power
   # df_combined["synthetic"] = np.clip(df_combined["synthetic"], 0, None)

    # KDE distribution comparison
    y_true = df_solar["Solar-Power-MW"]
    y_pred = df_solar["synthetic"]

    plt.figure(figsize=(10, 4))
    sns.kdeplot(y_true, label="Real", shade=True)
    sns.kdeplot(y_pred, label="Synthetic", shade=True)
    plt.title("Distribution Comparison")
    plt.xlabel("Power (MW)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Evaluation metrics
    y_true = y_true.values
    y_pred = y_pred.values

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1e-3))) * 100
    r2 = r2_score(y_true, y_pred)

    print(f"🔎 Evaluation Metrics:")
    print(f"   MAE  = {mae:.2f} MW")
    print(f"   RMSE = {rmse:.2f} MW")
    print(f"   R²   = {r2:.3f}")

# Example usage:
plot_kde_and_metrics(df_solar)
No description has been provided for this image
🔎 Evaluation Metrics:
   MAE  = 2587.95 MW
   RMSE = 4197.01 MW
   R²   = 0.846
In [56]:
df_solar['residual_daily']=df_solar["Solar-Power-MW"] - df_solar["synthetic"]
In [57]:
from statsmodels.graphics.gofplots import qqplot
qqplot(df_solar['residual_daily'].dropna(), line='s')
plt.title("QQ Plot of Residuals")
plt.grid(True)
plt.show()
No description has been provided for this image

Phase 2: Forecasting models¶

In [58]:
# Step 1: Convert index to datetime (if not already)
df_modeling.index = pd.to_datetime(df_modeling.index)

# Step 2: Resample to hourly mean
df_hourly = df_modeling.resample("H").mean()

# Step 3: Split into train and test sets
train_series = df_hourly[df_hourly.index < "2025-01-01"]
test_series_2025 = df_hourly[df_hourly.index >= "2025-01-01"]

# Step 4: Create second test_series between 2025-01-01 and 2025-02-01
test_series = df_hourly[(df_hourly.index >= "2025-01-01") & (df_hourly.index <= "2025-02-01")]
In [59]:
plt.figure(figsize=(14,6))
plt.plot(train_series.index, train_series, label='Train (Hourly)', alpha=0.8)
plt.plot(test_series_2025.index, test_series_2025, label='Test 2025 (Hourly)', alpha=0.8, color='orange')

plt.title("Hourly Solar Power - Train vs Test (2025)")
plt.xlabel("Date")
plt.ylabel("Solar Power (MW)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [60]:
print("Train NaNs:", train_series.isna().sum())
print("Test NaNs:", test_series.isna().sum())
print("Test NaNs:", test_series_2025.isna().sum())
Train NaNs: Solar-Power-MW    7
dtype: int64
Test NaNs: Solar-Power-MW    0
dtype: int64
Test NaNs: Solar-Power-MW    1
dtype: int64
In [61]:
train_series = train_series.ffill()
test_series = test_series.ffill()
In [62]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
residuals = df_hourly.diff().dropna()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plot_acf(residuals, ax=axes[0], lags=100)
plot_pacf(residuals, ax=axes[1], lags=100)
plt.show()
No description has been provided for this image
In [63]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
residuals = df_hourly.diff(periods=24).dropna()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plot_acf(residuals, ax=axes[0], lags=100)
plot_pacf(residuals, ax=axes[1], lags=100)
plt.show()
No description has been provided for this image
In [64]:
#pip install pmdarima
In [65]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm

Naive Forecast¶

✅ Naive Forecast

In [66]:
# Naive forecast: repeat the last value of the training set
train_series_smooth = train_series.rolling(window=24).mean()  # 24-hour rolling window (1 day)
last_non_zero_smooth = train_series_smooth.dropna().iloc[-1]

naive_forecast = pd.Series(
    [last_non_zero_smooth] * len(test_series),
    index=test_series.index
)
In [67]:
plt.figure(figsize=(14,6))
plt.plot(test_series.index, test_series, label="Actual 2025", alpha=0.8)
plt.plot(naive_forecast.index, naive_forecast, label="Naive Forecast", linestyle='--')
plt.title("Naive Forecast vs Actual (2025)")
plt.xlabel("Date")
plt.ylabel("Solar Power (MW)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [68]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae_naive = mean_absolute_error(test_series, naive_forecast)
rmse_naive = np.sqrt(mean_squared_error(test_series, naive_forecast))

print(f"Naive Forecast → MAE: {mae_naive:.2f}, RMSE: {rmse_naive:.2f}")
Naive Forecast → MAE: 4466.05, RMSE: 5686.62

Holt-Winters Exponential Smoothing¶

In [69]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Fit the Holt-Winters model
hw_model = ExponentialSmoothing(
    train_series, 
    trend='add',  # Additive trend (you can try 'multiplicative' if needed)
    seasonal='add',  # Additive seasonality (use 'multiplicative' if required)
    seasonal_periods=120  # Periodicity of data (e.g., 24 hours for daily seasonality)
).fit()

# Forecast the test period
hw_forecast = hw_model.forecast(len(test_series))
In [70]:
# Plot the Holt-Winters forecast against the actual data
plt.figure(figsize=(14,6))
plt.plot(test_series.index, test_series, label="Actual 2025", alpha=0.8)
plt.plot(hw_forecast.index, hw_forecast, label="Holt-Winters Forecast", linestyle='--', color='orange')
plt.title("Holt-Winters Forecast vs Actual (2025)")
plt.xlabel("Date")
plt.ylabel("Solar Power (MW)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [80]:
offset = 1e-2  # or a small constant like 0.1 MW

## When using multiplicative trend or seasonality in Holt-Winters models, the time series values must be strictly positive (i.e., no zeros or negatives) because multiplication with zero or negative values breaks the math behind the model.

train_shifted = train_series + offset
from statsmodels.tsa.holtwinters import ExponentialSmoothing

hw_model_mul = ExponentialSmoothing(
    train_shifted,
    trend='add',  # 'add' 
    seasonal='mul',
    seasonal_periods=24
).fit()

# Forecast (still shifted)
hw_forecast_shifted = hw_model_mul.forecast(len(test_series))

# Remove the offset
hw_forecast = hw_forecast_shifted - offset
In [81]:
# AIC, BIC, and Log-Likelihood for the multiplicative HW model
print(f"AIC: {hw_model_mul.aic}")
print(f"BIC: {hw_model_mul.bic}")
AIC: 536308.9017702524
BIC: 536545.9206155596
In [82]:
# Plot the Holt-Winters forecast against the actual data
plt.figure(figsize=(14,6))
plt.plot(test_series.index, test_series, label="Actual 2025", alpha=0.8)
plt.plot(hw_forecast.index, hw_forecast, label="Holt-Winters Forecast", linestyle='--', color='orange')
plt.title("Holt-Winters Forecast vs Actual (2025)")
plt.xlabel("Date")
plt.ylabel("Solar Power (MW)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

Auto ARIMA with full search¶

In [63]:
from pmdarima import auto_arima

# Run auto ARIMA with extended search space
#auto_model = auto_arima(
    train_series,
    start_p=20, max_p=30,
    start_q=20, max_q=30,
    d=None,               # Let it test both 1 and 2 automatically
    max_d=1,
    seasonal=False,       # This is plain ARIMA, no seasonality
    stepwise=True,       # False Full grid search (slower, but thorough)
    suppress_warnings=True,
    error_action='ignore',
    trace=True             # Show progress
)
  Cell In[63], line 5
    train_series,
    ^
IndentationError: unexpected indent
In [64]:
data = [
    ("ARIMA(0,1,0)", 664538.602, 0.47),
    ("ARIMA(1,1,0)", 611002.339, 0.53),
    ("ARIMA(0,1,1)", 623140.364, 5.58),
    ("ARIMA(0,1,0)", 664536.602, 0.34),
    ("ARIMA(2,1,0)", 578491.824, 1.67),
    ("ARIMA(3,1,0)", 575178.657, 1.71),
    ("ARIMA(4,1,0)", 573039.854, 2.21),
    ("ARIMA(5,1,0)", 573006.374, 2.68),
    ("ARIMA(6,1,0)", 572704.005, 3.41),
    ("ARIMA(7,1,0)", 572668.630, 3.66),
    ("ARIMA(8,1,0)", 572629.083, 5.89),
    ("ARIMA(9,1,0)", 572586.497, 6.62),
    ("ARIMA(10,1,0)", 572146.924, 8.55),
    ("ARIMA(11,1,0)", 570859.265, 9.52),
    ("ARIMA(12,1,0)", 569172.180, 11.72),
    ("ARIMA(13,1,0)", 567326.236, 13.05),
    ("ARIMA(14,1,0)", 565768.238, 16.47),
    ("ARIMA(15,1,0)", 563955.884, 15.40),
    ("ARIMA(16,1,0)", float('inf'), 20.74),
    ("ARIMA(15,1,1)", 565266.584, 139.70),
    ("ARIMA(14,1,1)", 558690.672, 165.96),
    ("ARIMA(13,1,1)", 568660.788, 121.95),
    ("ARIMA(14,1,2)", 559207.141, 169.38),
    ("ARIMA(13,1,2)", 560146.496, 140.64),
    ("ARIMA(15,1,2)", 557983.272, 160.61),
    ("ARIMA(16,1,2)", 556901.923, 217.41),
    ("ARIMA(16,1,1)", 555812.983, 214.47),
    ("ARIMA(17,1,1)", 553944.294, 217.75),
    ("ARIMA(17,1,0)", float('inf'), 23.44),
    ("ARIMA(18,1,1)", 552388.984, 240.75),
    ("ARIMA(18,1,0)", float('inf'), 25.90),
]

df = pd.DataFrame(data, columns=["Model", "AIC", "Time_sec"])
print(df)
            Model         AIC  Time_sec
0    ARIMA(0,1,0)  664538.602      0.47
1    ARIMA(1,1,0)  611002.339      0.53
2    ARIMA(0,1,1)  623140.364      5.58
3    ARIMA(0,1,0)  664536.602      0.34
4    ARIMA(2,1,0)  578491.824      1.67
5    ARIMA(3,1,0)  575178.657      1.71
6    ARIMA(4,1,0)  573039.854      2.21
7    ARIMA(5,1,0)  573006.374      2.68
8    ARIMA(6,1,0)  572704.005      3.41
9    ARIMA(7,1,0)  572668.630      3.66
10   ARIMA(8,1,0)  572629.083      5.89
11   ARIMA(9,1,0)  572586.497      6.62
12  ARIMA(10,1,0)  572146.924      8.55
13  ARIMA(11,1,0)  570859.265      9.52
14  ARIMA(12,1,0)  569172.180     11.72
15  ARIMA(13,1,0)  567326.236     13.05
16  ARIMA(14,1,0)  565768.238     16.47
17  ARIMA(15,1,0)  563955.884     15.40
18  ARIMA(16,1,0)         inf     20.74
19  ARIMA(15,1,1)  565266.584    139.70
20  ARIMA(14,1,1)  558690.672    165.96
21  ARIMA(13,1,1)  568660.788    121.95
22  ARIMA(14,1,2)  559207.141    169.38
23  ARIMA(13,1,2)  560146.496    140.64
24  ARIMA(15,1,2)  557983.272    160.61
25  ARIMA(16,1,2)  556901.923    217.41
26  ARIMA(16,1,1)  555812.983    214.47
27  ARIMA(17,1,1)  553944.294    217.75
28  ARIMA(17,1,0)         inf     23.44
29  ARIMA(18,1,1)  552388.984    240.75
30  ARIMA(18,1,0)         inf     25.90
In [103]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA model on training data
arima_model = ARIMA(train_series, order=(24, 0,1 )).fit()
In [104]:
forecast_object = arima_model.get_forecast(steps=len(test_series))
arima_model_forecast = forecast_object.predicted_mean  # Forecasted values
arima_model_conf_int = forecast_object.conf_int(alpha=0.05)  # Confidence intervals
arima_model_stderr = forecast_object.se_mean  # Standard errors of the forecasts
In [105]:
# Ensure the forecast and confidence intervals are in the correct format

# Filter the data for January 2025 (or any other date range you want)
start_date = '2025-01-01'
end_date = '2025-01-31'
actual = test_series.loc[start_date:end_date]
forecasted = arima_model_forecast.loc[start_date:end_date]
ci = arima_model_conf_int.loc[start_date:end_date]

# Plotting the actual vs forecasted values, including confidence intervals
plt.figure(figsize=(15, 5))
plt.plot(actual.index, actual, label='Actual', color='blue', linewidth=2)
plt.plot(forecasted.index, forecasted, label='ARIMA Forecast', linestyle='--', color='orange', linewidth=2)

# Plot the confidence intervals as shaded regions
plt.fill_between(ci.index, ci["lower Solar-Power-MW"], ci["upper Solar-Power-MW"], color='orange', alpha=0.2, label='Confidence Interval')

# Adding titles and labels
plt.title('Solar Generation Forecast vs Actual (Jan 2025)', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Solar Power (MW)', fontsize=12)
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [106]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA model on training data
arima_model1 = ARIMA(train_series, order=(4, 1, 3)).fit()
In [107]:
forecast_object = arima_model1.get_forecast(steps=len(test_series))
arima_model1_forecast = forecast_object.predicted_mean  # Forecasted values
arima_model1_conf_int = forecast_object.conf_int(alpha=0.05)  # Confidence intervals
arima_model1_stderr = forecast_object.se_mean  # Standard errors of the forecasts
In [108]:
# Ensure the forecast and confidence intervals are in the correct format

# Filter the data for January 2025 (or any other date range you want)
start_date = '2025-01-01'
end_date = '2025-01-31'
actual = test_series.loc[start_date:end_date]
forecasted = arima_model1_forecast.loc[start_date:end_date]
ci = arima_model_conf_int.loc[start_date:end_date]

# Plotting the actual vs forecasted values, including confidence intervals
plt.figure(figsize=(15, 5))
plt.plot(actual.index, actual, label='Actual', color='blue', linewidth=2)
plt.plot(forecasted.index, forecasted, label='ARIMA(4,1,3) Forecast', linestyle='--', color='orange', linewidth=2)

# Plot the confidence intervals as shaded regions
plt.fill_between(ci.index, ci["lower Solar-Power-MW"], ci["upper Solar-Power-MW"], color='orange', alpha=0.2, label='Confidence Interval')

# Adding titles and labels
plt.title('ARIMA(4,1,3) Forecast vs Actual (2025)', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Solar Power (MW)', fontsize=12)
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image

Sarima¶

In [109]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

sarima_model = SARIMAX(
    train_series,
    order=(4, 1, 3),               # ARIMA(p,d,q)
    seasonal_order=(1, 1, 1, 24),  # (P,D,Q,s) — daily seasonality (24 hours)
    enforce_stationarity=False,
    enforce_invertibility=False
)
In [110]:
results = sarima_model.fit(disp=False)
In [111]:
results.summary()
Out[111]:
SARIMAX Results
Dep. Variable: Solar-Power-MW No. Observations: 35065
Model: SARIMAX(4, 1, 3)x(1, 1, [1], 24) Log Likelihood -266497.819
Date: Sun, 11 May 2025 AIC 533015.639
Time: 17:11:45 BIC 533100.273
Sample: 12-31-2020 HQIC 533042.596
- 12-31-2024
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 1.1081 0.378 2.929 0.003 0.367 1.850
ar.L2 0.2354 0.700 0.336 0.737 -1.137 1.608
ar.L3 -0.6654 0.439 -1.516 0.130 -1.526 0.195
ar.L4 0.2010 0.091 2.211 0.027 0.023 0.379
ma.L1 0.1230 0.379 0.325 0.745 -0.619 0.865
ma.L2 -0.8552 0.235 -3.643 0.000 -1.315 -0.395
ma.L3 -0.2660 0.144 -1.850 0.064 -0.548 0.016
ar.S.L24 0.1422 0.006 22.235 0.000 0.130 0.155
ma.S.L24 -0.8157 0.004 -203.226 0.000 -0.824 -0.808
sigma2 3.389e+05 1903.773 178.003 0.000 3.35e+05 3.43e+05
Ljung-Box (L1) (Q): 0.05 Jarque-Bera (JB): 60401.67
Prob(Q): 0.82 Prob(JB): 0.00
Heteroskedasticity (H): 1.79 Skew: 0.13
Prob(H) (two-sided): 0.00 Kurtosis: 9.43


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [112]:
forecast = results.get_forecast(len(test_series))
sarima_forecast_mean = forecast.predicted_mean
sarima_conf_int = forecast.conf_int()
In [113]:
sarima_conf_int
Out[113]:
lower Solar-Power-MW upper Solar-Power-MW
2025-01-01 00:00:00 -1152.730463 1129.185992
2025-01-01 01:00:00 -2821.874072 2757.401463
2025-01-01 02:00:00 -4452.790724 4335.476313
2025-01-01 03:00:00 -5762.797837 5603.661408
2025-01-01 04:00:00 -6685.282639 6489.346621
... ... ...
2025-01-31 20:00:00 -14400.930530 11897.007586
2025-01-31 21:00:00 -14413.398482 11885.740783
2025-01-31 22:00:00 -14414.002540 11886.352027
2025-01-31 23:00:00 -14414.613338 11886.965433
2025-02-01 00:00:00 -14422.410710 11889.702999

745 rows × 2 columns

In [114]:
# Filter for January 2025 (or any other date range you want)
start_date = '2025-01-01'
end_date = '2025-01-31'
actual = test_series.loc[start_date:end_date]
forecasted = sarima_forecast_mean.loc[start_date:end_date]
ci = sarima_conf_int.loc[start_date:end_date]

# Plotting the actual vs forecasted values, including confidence intervals
plt.figure(figsize=(15, 5))
plt.plot(actual.index, actual, label='Actual', color='blue', linewidth=2)
plt.plot(forecasted.index, forecasted, label='SARIMA Forecast', linestyle='--', color='orange', linewidth=2)

# Plot the confidence intervals as shaded regions
plt.fill_between(ci.index, ci["lower Solar-Power-MW"], ci["upper Solar-Power-MW"], color='orange', alpha=0.2, label='Confidence Interval')

# Adding titles and labels
plt.title('SARIMA Forecast vs Actual (Jan 2025)', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Solar Power (MW)', fontsize=12)
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image

Sarimax¶

In [ ]:
#!pip install meteostat
In [ ]:
#!pip install pvlib
In [83]:
from meteostat import Hourly, Stations
from datetime import datetime

# Step 1: Define the date range
start = datetime(2021, 1, 1)
end = datetime.today()

# Step 2: Fetch Leipzig station directly
stations = Stations()
station_leipzig = stations.region('DE').fetch().query('name.str.contains("Leipzig", case=False)')

# Step 3: Fetch weather data for Leipzig station
station_id = station_leipzig.index[0]
data = Hourly(station_id, start, end)
weather = data.fetch()

# Step 4: Specify the weather columns you want to keep
required_columns = ['temp', 'rhum', 'prcp', 'snow', 'wspd', 'coco']

# Step 5: Filter and fill missing data
weather_filtered = weather[required_columns].fillna(method='ffill')

# Step 6: Preview the data
print(weather_filtered.head())
                     temp   rhum  prcp  snow  wspd  coco
time                                                    
2021-01-01 00:00:00   2.8   77.0   0.0   0.0  13.0   4.0
2021-01-01 01:00:00   2.0   81.0   0.0   0.0  11.2   3.0
2021-01-01 02:00:00   2.0   81.0   0.0   0.0   9.4   4.0
2021-01-01 03:00:00   2.0   87.0   0.0   0.0  11.2   7.0
2021-01-01 04:00:00   1.0  100.0   0.1   0.0  13.0  14.0
In [84]:
import pvlib
from datetime import datetime

# Define time range
start = datetime(2021, 1, 1)
end = datetime.today()

# Create hourly time index for Europe/Berlin timezone
times = pd.date_range(start=start, end=end, freq='H', tz='Europe/Berlin')

# Define Leipzig location (latitude and longitude)
# Leipzig coordinates: 51.3397° N, 12.3731° E
location = pvlib.location.Location(latitude=51.3397, longitude=12.3731, tz='Europe/Berlin')

# Get solar position data
solpos = location.get_solarposition(times)

# Get clear-sky irradiance using Ineichen model
clearsky = location.get_clearsky(times, model='ineichen')

# Combine solar position and irradiance into one DataFrame
solar_data_leipzig = pd.concat([solpos, clearsky], axis=1)
# Step 1: Reset index to access the datetime
solar_data_leipzig = solar_data_leipzig.reset_index()

# Step 2: Combine the date and time into a single Date-Time column
solar_data_leipzig['Date-Time'] = solar_data_leipzig['index'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Step 3: Keep only the required columns: Date-Time, ghi, dni, dhi
solar_cleaned = solar_data_leipzig[['Date-Time', 'ghi', 'dni', 'dhi']]

# Preview cleaned data
print(solar_cleaned.head())
             Date-Time  ghi  dni  dhi
0  2021-01-01 00:00:00  0.0  0.0  0.0
1  2021-01-01 01:00:00  0.0  0.0  0.0
2  2021-01-01 02:00:00  0.0  0.0  0.0
3  2021-01-01 03:00:00  0.0  0.0  0.0
4  2021-01-01 04:00:00  0.0  0.0  0.0
In [85]:
# Align indexes and join
# Assuming solar_cleaned, weather_filtered, and df_hourly are already defined

# Ensure the time index is properly set for each DataFrame
solar_cleaned['Date-Time'] = pd.to_datetime(solar_cleaned['Date-Time'])
weather_filtered.index = pd.to_datetime(weather_filtered.index)
df_hourly.index = pd.to_datetime(df_hourly.index)

# Merge the dataframes on the 'Date-Time' or time index
# Adjust according to the names of your actual dataframes and columns

# First, merge solar_cleaned with weather_filtered on the Date-Time
merged_data_1 = pd.merge(solar_cleaned, weather_filtered, how='inner', left_on='Date-Time', right_index=True)

# Then, merge the result with df_hourly on the Date-Time index
df_model_sarima = pd.merge(merged_data_1, df_hourly, how='inner', left_on='Date-Time', right_index=True)

# Step 3: Preview the merged data
print(df_model_sarima.head())
            Date-Time  ghi  dni  dhi  temp   rhum  prcp  snow  wspd  coco  \
0 2021-01-01 00:00:00  0.0  0.0  0.0   2.8   77.0   0.0   0.0  13.0   4.0   
1 2021-01-01 01:00:00  0.0  0.0  0.0   2.0   81.0   0.0   0.0  11.2   3.0   
2 2021-01-01 02:00:00  0.0  0.0  0.0   2.0   81.0   0.0   0.0   9.4   4.0   
3 2021-01-01 03:00:00  0.0  0.0  0.0   2.0   87.0   0.0   0.0  11.2   7.0   
4 2021-01-01 04:00:00  0.0  0.0  0.0   1.0  100.0   0.1   0.0  13.0  14.0   

   Solar-Power-MW  
0            0.00  
1            0.00  
2            0.00  
3            0.65  
4            1.45  
In [87]:
# Convert 'Date-Time' column to datetime
df_model_sarima['Date-Time'] = pd.to_datetime(df_model_sarima['Date-Time'])
In [8]:
#df_model_sarima.to_excel("/Users/mac/Desktop/df_model_sarimax.xlsx", index=False)
df_model_sarima = pd.read_excel("df_model_sarimax.xlsx")
In [9]:
# Set 'Date-Time' as the index for time series modeling
df_model_sarima.set_index('Date-Time', inplace=True)
In [10]:
# Plot correlation heatmap
# Compute correlation matrix
corr = df_model_sarima.corr()

# Plot heatmap of correlations
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()
No description has been provided for this image
In [11]:
df_model_sarima
Out[11]:
ghi dni dhi temp rhum prcp snow wspd coco Solar-Power-MW
Date-Time
2021-01-01 00:00:00 0.000000 0.000000 0.000000 2.8 77 0.0 0 13.0 4 0.000
2021-01-01 01:00:00 0.000000 0.000000 0.000000 2.0 81 0.0 0 11.2 3 0.000
2021-01-01 02:00:00 0.000000 0.000000 0.000000 2.0 81 0.0 0 9.4 4 0.000
2021-01-01 03:00:00 0.000000 0.000000 0.000000 2.0 87 0.0 0 11.2 7 0.650
2021-01-01 04:00:00 0.000000 0.000000 0.000000 1.0 100 0.1 0 13.0 14 1.450
... ... ... ... ... ... ... ... ... ... ...
2025-04-18 11:00:00 624.134576 759.986796 117.729515 12.3 81 0.0 0 14.8 4 12999.775
2025-04-18 12:00:00 702.420417 787.759724 124.219684 13.2 79 0.0 0 14.8 4 12679.300
2025-04-18 13:00:00 734.672056 797.994445 126.779189 13.7 77 0.0 0 14.8 4 11320.350
2025-04-18 14:00:00 718.490649 792.939367 125.502551 13.4 82 0.0 0 14.8 4 9187.225
2025-04-18 15:00:00 655.105964 771.511568 120.348234 12.5 85 0.0 0 14.8 4 7251.050

37647 rows × 10 columns

In [93]:
# Ensure the index is datetime type
df_model_sarima.index = pd.to_datetime(df_model_sarima.index)

# Split into train and test
train_sarimax = df_model_sarima[df_model_sarima.index.year < 2025]
test_sarimax = df_model_sarima[df_model_sarima.index.year == 2025]

# Show summary
print("Train shape:", train_sarimax.shape)
print("Test shape:", test_sarimax.shape)
Train shape: (35064, 10)
Test shape: (2583, 10)
In [94]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Set up SARIMAX model (example with arbitrary parameters)
sarima_model = SARIMAX(train_sarimax['Solar-Power-MW'],
                       exog=train_sarimax[['dhi', 'temp', 'rhum', 'coco']],
                       order=(4, 1, 3), 
                       seasonal_order=(1, 1, 1, 24))  # Example seasonal order with daily seasonality

# Fit the model
sarima_results = sarima_model.fit()
 This problem is unconstrained.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           14     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  7.70176D+00    |proj g|=  1.01140D-01

At iterate    5    f=  7.64824D+00    |proj g|=  3.98328D-02

At iterate   10    f=  7.64511D+00    |proj g|=  2.79759D-03

At iterate   15    f=  7.64130D+00    |proj g|=  9.24587D-03

At iterate   20    f=  7.63748D+00    |proj g|=  4.47414D-03

At iterate   25    f=  7.63644D+00    |proj g|=  2.53013D-03

At iterate   30    f=  7.63622D+00    |proj g|=  5.73130D-04

At iterate   35    f=  7.63531D+00    |proj g|=  5.01653D-03

At iterate   40    f=  7.61680D+00    |proj g|=  5.22575D-02

At iterate   45    f=  7.61134D+00    |proj g|=  4.32684D-03

At iterate   50    f=  7.61063D+00    |proj g|=  2.56350D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   14     50     56      1     0     0   2.563D-03   7.611D+00
  F =   7.6106348117808462     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
In [91]:
## Print summary of the results
print(sarima_results.summary())
                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:                       Solar-Power-MW   No. Observations:                35064
Model:             SARIMAX(4, 1, 3)x(1, 1, [1], 24)   Log Likelihood             -266863.441
Date:                              Sun, 11 May 2025   AIC                         533754.883
Time:                                      05:41:28   BIC                         533873.382
Sample:                                           0   HQIC                        533792.626
                                            - 35064                                         
Covariance Type:                                opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
dhi           -9.0816      1.400     -6.486      0.000     -11.826      -6.337
temp          37.7637      2.467     15.305      0.000      32.928      42.600
rhum           3.6234      0.415      8.741      0.000       2.811       4.436
coco          15.5594      0.517     30.113      0.000      14.547      16.572
ar.L1          0.8578      0.009     96.551      0.000       0.840       0.875
ar.L2          0.6759      0.008     84.319      0.000       0.660       0.692
ar.L3         -0.9269      0.008   -110.713      0.000      -0.943      -0.911
ar.L4          0.2522      0.008     30.984      0.000       0.236       0.268
ma.L1          0.3417      0.011     32.507      0.000       0.321       0.362
ma.L2         -0.9989      0.006   -154.617      0.000      -1.012      -0.986
ma.L3         -0.3427      0.009    -38.146      0.000      -0.360      -0.325
ar.S.L24       0.1539      0.005     29.355      0.000       0.144       0.164
ma.S.L24      -0.8063      0.003   -244.618      0.000      -0.813      -0.800
sigma2      2.912e+05   6.36e-06   4.58e+10      0.000    2.91e+05    2.91e+05
===================================================================================
Ljung-Box (L1) (Q):                   0.12   Jarque-Bera (JB):             50580.52
Prob(Q):                              0.73   Prob(JB):                         0.00
Heteroskedasticity (H):               1.71   Skew:                             0.19
Prob(H) (two-sided):                  0.00   Kurtosis:                         8.87
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.62e+28. Standard errors may be unstable.
In [95]:
# Plot diagnostics for the fitted SARIMAX model
sarima_results.plot_diagnostics(figsize=(12, 8))
plt.show()
No description has been provided for this image
In [96]:
# Extract just January 2025
exog_january = test_sarimax.loc['2025-01-01':'2025-02-01', ['dhi', 'temp', 'rhum', 'coco']]

# Get forecast for exactly that many steps
forecast = sarima_results.get_forecast(steps=len(exog_january), exog=exog_january)
In [97]:
# Extract the predicted mean (point forecast)
sarimax_predicted1 = forecast.predicted_mean

# Optionally: Extract confidence intervals
sarimax_conf_int = forecast.conf_int()
In [98]:
test_sarimax.index = pd.to_datetime(test_sarimax.index)
# Make forecast for full test set
forecast = sarima_results.get_forecast(steps=len(test_sarimax), exog=test_sarimax[['dhi', 'temp', 'rhum', 'coco']])
sarimax_predicted = forecast.predicted_mean
sarimax_conf_int = forecast.conf_int()

# Set index of prediction to match test set
sarimax_predicted.index = test_sarimax.index
sarimax_conf_int.index = test_sarimax.index
In [102]:
# Slice for January
start_date = '2025-01-01'
end_date = '2025-02-01'
actual = test_sarimax.loc[start_date:end_date, 'Solar-Power-MW']
forecasted = sarimax_predicted.loc[start_date:end_date]
ci = sarimax_conf_int.loc[start_date:end_date]

# Plot
plt.figure(figsize=(15, 5))
plt.plot(actual.index, actual, label='Actual', color='blue')
plt.plot(forecasted.index, forecasted, label='Forecast', linestyle='--', color='orange')
plt.fill_between(ci.index, ci.iloc[:, 0], ci.iloc[:, 1], color='orange', alpha=0.2, label='Confidence Interval')

plt.title('Solar Generation Forecast vs Actual (Jan 2025)')
plt.xlabel('Date')
plt.ylabel('Solar Power (MW)')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

Selecting the best model¶

In [116]:
model_scores = []

# Holt-Winters (check if AIC/BIC are available)
try:
    model_scores.append({
        "model": "Holt-Winters",
        "aic": hw_model_mul.aic,
        "bic": hw_model_mul.bic,
        "llf": hw_model_mul.llf
    })
except AttributeError:
    model_scores.append({
        "model": "Holt-Winters",
        "aic": None,
        "bic": None,
        "llf": None
    })

# ARIMA
model_scores.append({
    "model": "ARIMA",
    "aic": arima_model.aic,
    "bic": arima_model.bic,
    "llf": arima_model.llf
})

# ARIMA v2
model_scores.append({
    "model": "ARIMA v2",
    "aic": arima_model1.aic,
    "bic": arima_model1.bic,
    "llf": arima_model1.llf
})

# SARIMA
model_scores.append({
    "model": "SARIMA",
    "aic": results.aic,
    "bic": results.bic,
    "llf": results.llf
})

# SARIMAX
model_scores.append({
    "model": "SARIMAX",
    "aic": sarima_results.aic,
    "bic": sarima_results.bic,
    "llf": sarima_results.llf
})

# Create a summary table
scores_df = pd.DataFrame(model_scores)
scores_df_sorted = scores_df.sort_values(by="aic")
print(scores_df_sorted)
          model            aic            bic            llf
3        SARIMA  533015.638684  533100.273146 -266497.819342
4       SARIMAX  533746.598081  533865.097119 -266859.299040
1         ARIMA  545580.456614  545809.010500 -272763.228307
2      ARIMA v2  564847.491792  564915.211234 -282415.745896
0  Holt-Winters            NaN            NaN            NaN
In [117]:
def plot_model_selection(scores_df):
    models = scores_df['model']

    plt.figure(figsize=(15, 5))

    # AIC
    plt.subplot(1, 3, 1)
    plt.bar(models, scores_df['aic'], color='skyblue')
    plt.title('AIC Comparison')
    plt.ylabel('AIC')
    plt.xticks(rotation=45)
    plt.grid(axis='y')

    # BIC
    plt.subplot(1, 3, 2)
    plt.bar(models, scores_df['bic'], color='salmon')
    plt.title('BIC Comparison')
    plt.ylabel('BIC')
    plt.xticks(rotation=45)
    plt.grid(axis='y')

    # Log-Likelihood
    plt.subplot(1, 3, 3)
    plt.bar(models, scores_df['llf'], color='lightgreen')
    plt.title('Log-Likelihood Comparison')
    plt.ylabel('Log-Likelihood')
    plt.xticks(rotation=45)
    plt.grid(axis='y')

    plt.tight_layout()
    plt.show()

# Run the plot
plot_model_selection(scores_df_sorted)
No description has been provided for this image
In [123]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Check if all forecasts are aligned with the test series (Ensure consistent length)
def align_forecast(test_series, forecast):
    if len(forecast) != len(test_series):
        forecast = forecast[:len(test_series)]
    return forecast

# Apply alignment to all forecasts
hw_forecast = align_forecast(test_series, hw_forecast)
arima_forecast = align_forecast(test_series, arima_model_forecast)
arima_forecast1 = align_forecast(test_series, arima_model1_forecast)
sarima_forecast_filled = align_forecast(test_series, sarima_forecast_mean)
sarimax_pred = align_forecast(test_series, forecasted)

# Calculate MAE and RMSE for each model

# Holt-Winters
mae_hw = mean_absolute_error(test_series, hw_forecast)
rmse_hw = np.sqrt(mean_squared_error(test_series, hw_forecast))
print(f"Holt-Winters → MAE: {mae_hw:.2f}, RMSE: {rmse_hw:.2f}")

# ARIMA
mae_arima = mean_absolute_error(test_series, arima_model_forecast)
rmse_arima = np.sqrt(mean_squared_error(test_series, arima_model_forecast))
print(f"ARIMA → MAE: {mae_arima:.2f}, RMSE: {rmse_arima:.2f}")

# ARIMA v2
mae_arima2 = mean_absolute_error(test_series, arima_model1_forecast)
rmse_arima2 = np.sqrt(mean_squared_error(test_series, arima_model1_forecast))
print(f"ARIMA v2 → MAE: {mae_arima2:.2f}, RMSE: {rmse_arima2:.2f}")

# SARIMA
mae_sarima = mean_absolute_error(test_series, sarima_forecast_mean)
rmse_sarima = np.sqrt(mean_squared_error(test_series, sarima_forecast_mean))
print(f"SARIMA → MAE: {mae_sarima:.2f}, RMSE: {rmse_sarima:.2f}")

# SARIMAX
mae_sarimax = mean_absolute_error(actual, forecasted)
rmse_sarimax = np.sqrt(mean_squared_error(actual, forecasted))
print(f"SARIMAX → MAE: {mae_sarimax:.2f}, RMSE: {rmse_sarimax:.2f}")

# Storing results in a summary table
forecast_results = [
    {"model": "Holt-Winters", "mae": mae_hw, "rmse": rmse_hw},
    {"model": "ARIMA", "mae": mae_arima, "rmse": rmse_arima},
    {"model": "ARIMA v2", "mae": mae_arima2, "rmse": rmse_arima2},
    {"model": "SARIMA", "mae": mae_sarima, "rmse": rmse_sarima},
    {"model": "SARIMAX", "mae": mae_sarimax, "rmse": rmse_sarimax},
]

# Create the DataFrame for comparison
import pandas as pd
forecast_df = pd.DataFrame(forecast_results)
forecast_df_sorted = forecast_df.sort_values(by="rmse")

# Print the sorted summary table
print("\nModel Comparison (Sorted by RMSE):")
print(forecast_df_sorted)
Holt-Winters → MAE: 2010.51, RMSE: 4651.62
ARIMA → MAE: 4462.13, RMSE: 5076.36
ARIMA v2 → MAE: 4868.37, RMSE: 5744.13
SARIMA → MAE: 1661.47, RMSE: 2754.89
SARIMAX → MAE: 1662.00, RMSE: 2756.35

Model Comparison (Sorted by RMSE):
          model          mae         rmse
3        SARIMA  1661.469875  2754.888877
4       SARIMAX  1662.000945  2756.348690
0  Holt-Winters  2010.507467  4651.615649
1         ARIMA  4462.128060  5076.361625
2      ARIMA v2  4868.366377  5744.133398
In [ ]:
 
In [124]:
# Create the DataFrame for comparison
forecast_df = pd.DataFrame(forecast_results)

# Plotting MAE and RMSE for each model
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Bar plot for MAE
ax[0].bar(forecast_df_sorted['model'], forecast_df_sorted['mae'], color='blue', alpha=0.7)
ax[0].set_title('Model Comparison - MAE')
ax[0].set_xlabel('Model')
ax[0].set_ylabel('MAE')
ax[0].tick_params(axis='x', rotation=45)

# Bar plot for RMSE
ax[1].bar(forecast_df_sorted['model'], forecast_df_sorted['rmse'], color='green', alpha=0.7)
ax[1].set_title('Model Comparison - RMSE')
ax[1].set_xlabel('Model')
ax[1].set_ylabel('RMSE')
ax[1].tick_params(axis='x', rotation=45)

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Optional LSTM¶

In [70]:
#pip install tensorflow
In [10]:
train_series = df_modeling[df_modeling.index < "2025-01-01"]
test_series = df_modeling[df_modeling.index >= "2025-01-01"]
In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assuming you have two time series: train_series and test_series
scaler = MinMaxScaler()

# Scaling the training and testing data
train_scaled = scaler.fit_transform(train_series.values.reshape(-1, 1))
test_scaled = scaler.transform(test_series.values.reshape(-1, 1))

# Function to create sequences for LSTM
def create_sequences(data, window_size):
    X, y = [], []
    for i in range(window_size, len(data)):
        X.append(data[i - window_size:i])  # Creating the input sequence
        y.append(data[i])  # Creating the corresponding target value
    return np.array(X), np.array(y)

# Window size parameter (number of past hours to consider)
window_size = 24

# Create X and y for training and testing
X_train, y_train = create_sequences(train_scaled, window_size)
X_test, y_test = create_sequences(test_scaled, window_size)
In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Create the model
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(window_size, 1)))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mse')
model.summary()

# Set up EarlyStopping to prevent overfitting
early_stop = EarlyStopping(
    monitor='val_loss',     # Monitor the validation loss
    patience=3,             # If validation loss doesn't improve for 3 epochs, stop training
    restore_best_weights=True  # Restore the best model weights when stopping
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],  # Add early stopping callback
    verbose=1
)
2025-05-11 18:27:24.501562: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ lstm (LSTM)                     │ (None, 64)             │        16,896 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 1)              │            65 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 16,961 (66.25 KB)
 Trainable params: 16,961 (66.25 KB)
 Non-trainable params: 0 (0.00 B)
Epoch 1/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 26s 6ms/step - loss: 0.0023 - val_loss: 5.0423e-05
Epoch 2/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 24s 6ms/step - loss: 1.9359e-05 - val_loss: 2.2556e-05
Epoch 3/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 24s 6ms/step - loss: 1.1608e-05 - val_loss: 1.1977e-05
Epoch 4/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 24s 6ms/step - loss: 8.4773e-06 - val_loss: 1.3506e-05
Epoch 5/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 27s 6ms/step - loss: 7.4589e-06 - val_loss: 1.3186e-05
Epoch 6/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 28s 6ms/step - loss: 6.7534e-06 - val_loss: 7.6597e-06
Epoch 7/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 25s 6ms/step - loss: 6.5524e-06 - val_loss: 1.2351e-05
Epoch 8/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 25s 6ms/step - loss: 6.0687e-06 - val_loss: 6.4043e-06
Epoch 9/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 25s 6ms/step - loss: 6.0242e-06 - val_loss: 8.6305e-06
Epoch 10/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 26s 6ms/step - loss: 5.8956e-06 - val_loss: 9.0492e-06
Epoch 11/15
4383/4383 ━━━━━━━━━━━━━━━━━━━━ 27s 6ms/step - loss: 5.4897e-06 - val_loss: 6.7543e-06
In [13]:
import matplotlib.pyplot as plt

# Plot the training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.title('Training vs Validation Loss')
plt.legend()  
plt.grid(True)  
plt.show()  
No description has been provided for this image
In [14]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np

# Predict using the trained model
pred_scaled = model.predict(X_test)

# Inverse transform to original scale
pred_lstm = scaler.inverse_transform(pred_scaled)

# Create forecast index to align with predictions
forecast_index = test_series.index[window_size:]
lstm_forecast = pd.Series(pred_lstm.flatten(), index=forecast_index)

# Ensure indices are datetime objects
test_series.index = pd.to_datetime(test_series.index)
lstm_forecast.index = pd.to_datetime(lstm_forecast.index)

# Plot actual vs forecast
plt.figure(figsize=(14, 6))
plt.plot(test_series.index, test_series, label="Actual 2025", alpha=0.8)
plt.plot(lstm_forecast.index, lstm_forecast, label="LSTM Forecast", linestyle='--', color='purple')
plt.title("LSTM Forecast vs Actual (2025)")
plt.xlabel("Date")
plt.ylabel("Solar Power (MW)")
plt.legend()
plt.grid(True)
plt.tight_layout()

# Zoom in on a specific date range (adjust as needed)
start_date = pd.to_datetime('2025-01-01')
end_date = pd.to_datetime('2025-02-01')
plt.xlim(start_date, end_date)

plt.show()

# Calculate error metrics
true_values = test_series[window_size:]  # Align with X_test sequences
mae_lstm = mean_absolute_error(true_values, lstm_forecast)
rmse_lstm = np.sqrt(mean_squared_error(true_values, lstm_forecast))

print(f"LSTM → MAE: {mae_lstm:.2f}, RMSE: {rmse_lstm:.2f}")
323/323 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step
No description has been provided for this image
LSTM → MAE: 72.16, RMSE: 130.60
In [ ]: